NUPUR HIRAY¶

Data Science Intern at CodeClause

CODECLAUSE PROJECT 1 : Churn Prediction in Telecom Industry Using Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import plotly.express as px
In [2]:
#INPORTING DATASET
data1 = pd.read_csv("C:/Users/Nupur/Documents/A INTERNSHIP/Telco data.csv")
In [3]:
data1.head()
Out[3]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [4]:
data1.shape
Out[4]:
(7043, 21)
In [5]:
data1.describe()
Out[5]:
SeniorCitizen tenure MonthlyCharges
count 7043.000000 7043.000000 7043.000000
mean 0.162147 32.371149 64.761692
std 0.368612 24.559481 30.090047
min 0.000000 0.000000 18.250000
25% 0.000000 9.000000 35.500000
50% 0.000000 29.000000 70.350000
75% 0.000000 55.000000 89.850000
max 1.000000 72.000000 118.750000
In [6]:
# Checking Null values
data1.notnull().sum()
Out[6]:
customerID          7043
gender              7043
SeniorCitizen       7043
Partner             7043
Dependents          7043
tenure              7043
PhoneService        7043
MultipleLines       7043
InternetService     7043
OnlineSecurity      7043
OnlineBackup        7043
DeviceProtection    7043
TechSupport         7043
StreamingTV         7043
StreamingMovies     7043
Contract            7043
PaperlessBilling    7043
PaymentMethod       7043
MonthlyCharges      7043
TotalCharges        7043
Churn               7043
dtype: int64
In [7]:
#There is no missing value in our dataset
%matplotlib inline
In [8]:
data_hist = px.histogram(data1, x='gender',color='Churn',marginal='box', color_discrete_sequence = ['darksalmon','dimgrey'])
data_hist.update_layout(bargap=0.2)
In [9]:
plt.bar(data1['gender'],data1['Churn'])
Out[9]:
<BarContainer object of 7043 artists>
In [10]:
data1.hist(bins = 30, figsize=(20,15))
Out[10]:
array([[<AxesSubplot:title={'center':'SeniorCitizen'}>,
        <AxesSubplot:title={'center':'tenure'}>],
       [<AxesSubplot:title={'center':'MonthlyCharges'}>, <AxesSubplot:>]],
      dtype=object)
In [11]:
sns.pairplot(data1)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x2222b4b08b0>
In [12]:
#Removing gender, customerID,tenture they are not usefull
col = ['gender','customerID','tenure']
data1 = data1.drop(col,axis = 1)
sns.pairplot(data1)
Out[12]:
<seaborn.axisgrid.PairGrid at 0x2222b4fbe80>
In [13]:
data1['TotalCharges'].notnull().sum()
Out[13]:
7043
In [14]:
data1['MonthlyCharges'].describe()
Out[14]:
count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: MonthlyCharges, dtype: float64
In [15]:
data1['TotalCharges'].describe()
#the datatype of the TotalCharges is Object so we will changec that
Out[15]:
count     7043
unique    6531
top           
freq        11
Name: TotalCharges, dtype: object
In [16]:
#due to string(" ") at 488 position you can not change the TotalCharges into Int
#so we will be removing/replacing that string which is --> " "
data1['TotalCharges'] = data1['TotalCharges'].replace(" ",np.nan)
data1['TotalCharges'] = pd.to_numeric(data1['TotalCharges'], errors = 'coerce') # coerce will replace all the non-numeric values with NaN
#dropping all the rows in which there is a null value
data1 = data1.dropna(how = "any", axis = 0) #removing all the rows which have null value in it
data1['TotalCharges'].describe()
Out[16]:
count    7032.000000
mean     2283.300441
std      2266.771362
min        18.800000
25%       401.450000
50%      1397.475000
75%      3794.737500
max      8684.800000
Name: TotalCharges, dtype: float64
In [17]:
data1.notnull().sum()
Out[17]:
SeniorCitizen       7032
Partner             7032
Dependents          7032
PhoneService        7032
MultipleLines       7032
InternetService     7032
OnlineSecurity      7032
OnlineBackup        7032
DeviceProtection    7032
TechSupport         7032
StreamingTV         7032
StreamingMovies     7032
Contract            7032
PaperlessBilling    7032
PaymentMethod       7032
MonthlyCharges      7032
TotalCharges        7032
Churn               7032
dtype: int64

EDA (EXPLORATORY DATA ANALYSIS)¶

In [18]:
data1['Churn'].describe()
Out[18]:
count     7032
unique       2
top         No
freq      5163
Name: Churn, dtype: object
In [19]:
for i, predictor in enumerate(data1.drop(columns=['Churn', 'TotalCharges', 'MonthlyCharges'])):
    ax = sns.countplot(data = data1, x = predictor, hue='Churn')
    if predictor == "PaymentMethod": 
        ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
        plt.tight_layout()
        plt.show()
    else:
        plt.tight_layout()
        plt.show()
In [20]:
#converting Yes as 1 and No as 0
data1["Churn"] = data1["Churn"].replace(['Yes','No'],[1,0])
C:\Users\Nupur\AppData\Local\Temp\ipykernel_19184\151837023.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [21]:
data1
Out[21]:
SeniorCitizen Partner Dependents PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 0 Yes No No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 0
1 0 No No Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 0
2 0 No No Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 1
3 0 No No No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 0
4 0 No No Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 Yes Yes Yes Yes DSL Yes No Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.50 0
7039 0 Yes Yes Yes Yes Fiber optic No Yes Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.90 0
7040 0 Yes Yes No No phone service DSL Yes No No No No No Month-to-month Yes Electronic check 29.60 346.45 0
7041 1 Yes No Yes Yes Fiber optic No No No No No No Month-to-month Yes Mailed check 74.40 306.60 1
7042 0 No No Yes No Fiber optic Yes No Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.50 0

7032 rows × 18 columns

In [22]:
telco_data_dummies = pd.get_dummies(data1)
In [23]:
telco_data_dummies
Out[23]:
SeniorCitizen MonthlyCharges TotalCharges Churn Partner_No Partner_Yes Dependents_No Dependents_Yes PhoneService_No PhoneService_Yes ... StreamingMovies_Yes Contract_Month-to-month Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 29.85 29.85 0 0 1 1 0 1 0 ... 0 1 0 0 0 1 0 0 1 0
1 0 56.95 1889.50 0 1 0 1 0 0 1 ... 0 0 1 0 1 0 0 0 0 1
2 0 53.85 108.15 1 1 0 1 0 0 1 ... 0 1 0 0 0 1 0 0 0 1
3 0 42.30 1840.75 0 1 0 1 0 1 0 ... 0 0 1 0 1 0 1 0 0 0
4 0 70.70 151.65 1 1 0 1 0 0 1 ... 0 1 0 0 0 1 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 84.80 1990.50 0 0 1 0 1 0 1 ... 1 0 1 0 0 1 0 0 0 1
7039 0 103.20 7362.90 0 0 1 0 1 0 1 ... 1 0 1 0 0 1 0 1 0 0
7040 0 29.60 346.45 0 0 1 0 1 1 0 ... 0 1 0 0 0 1 0 0 1 0
7041 1 74.40 306.60 1 0 1 1 0 0 1 ... 0 1 0 0 0 1 0 0 0 1
7042 0 105.65 6844.50 0 1 0 1 0 0 1 ... 1 0 0 1 0 1 1 0 0 0

7032 rows × 43 columns

In [24]:
churn_corr_matrix = telco_data_dummies.corr()
In [25]:
churn_corr_matrix['Churn'].sort_values(ascending = False).plot(kind='bar',figsize = (15,10))
Out[25]:
<AxesSubplot:>

HIGH Churn seen in case of monthly contracts, no online security, no technical support, first year subscription and fiber optic Internet;

LOW Churn is seen in the case of long-term contracts, subscriptions without internet service and customers contracted for more than 5 years;

Factors such as gender, phone service availability, and number of multiple lines have almost NO impact on churn!

In [26]:
churn_corr_matrix['Churn'].sort_values(ascending = False)
Out[26]:
Churn                                      1.000000
Contract_Month-to-month                    0.404565
OnlineSecurity_No                          0.342235
TechSupport_No                             0.336877
InternetService_Fiber optic                0.307463
PaymentMethod_Electronic check             0.301455
OnlineBackup_No                            0.267595
DeviceProtection_No                        0.252056
MonthlyCharges                             0.192858
PaperlessBilling_Yes                       0.191454
Dependents_No                              0.163128
SeniorCitizen                              0.150541
Partner_No                                 0.149982
StreamingMovies_No                         0.130920
StreamingTV_No                             0.128435
StreamingTV_Yes                            0.063254
StreamingMovies_Yes                        0.060860
MultipleLines_Yes                          0.040033
PhoneService_Yes                           0.011691
PhoneService_No                           -0.011691
MultipleLines_No phone service            -0.011691
MultipleLines_No                          -0.032654
DeviceProtection_Yes                      -0.066193
OnlineBackup_Yes                          -0.082307
PaymentMethod_Mailed check                -0.090773
PaymentMethod_Bank transfer (automatic)   -0.118136
InternetService_DSL                       -0.124141
PaymentMethod_Credit card (automatic)     -0.134687
Partner_Yes                               -0.149982
Dependents_Yes                            -0.163128
TechSupport_Yes                           -0.164716
OnlineSecurity_Yes                        -0.171270
Contract_One year                         -0.178225
PaperlessBilling_No                       -0.191454
TotalCharges                              -0.199484
StreamingTV_No internet service           -0.227578
OnlineSecurity_No internet service        -0.227578
InternetService_No                        -0.227578
StreamingMovies_No internet service       -0.227578
OnlineBackup_No internet service          -0.227578
TechSupport_No internet service           -0.227578
DeviceProtection_No internet service      -0.227578
Contract_Two year                         -0.301552
Name: Churn, dtype: float64
In [27]:
x = telco_data_dummies.drop('Churn',axis = 1)
x
Out[27]:
SeniorCitizen MonthlyCharges TotalCharges Partner_No Partner_Yes Dependents_No Dependents_Yes PhoneService_No PhoneService_Yes MultipleLines_No ... StreamingMovies_Yes Contract_Month-to-month Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 29.85 29.85 0 1 1 0 1 0 0 ... 0 1 0 0 0 1 0 0 1 0
1 0 56.95 1889.50 1 0 1 0 0 1 1 ... 0 0 1 0 1 0 0 0 0 1
2 0 53.85 108.15 1 0 1 0 0 1 1 ... 0 1 0 0 0 1 0 0 0 1
3 0 42.30 1840.75 1 0 1 0 1 0 0 ... 0 0 1 0 1 0 1 0 0 0
4 0 70.70 151.65 1 0 1 0 0 1 1 ... 0 1 0 0 0 1 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 84.80 1990.50 0 1 0 1 0 1 0 ... 1 0 1 0 0 1 0 0 0 1
7039 0 103.20 7362.90 0 1 0 1 0 1 0 ... 1 0 1 0 0 1 0 1 0 0
7040 0 29.60 346.45 0 1 0 1 1 0 0 ... 0 1 0 0 0 1 0 0 1 0
7041 1 74.40 306.60 0 1 1 0 0 1 0 ... 0 1 0 0 0 1 0 0 0 1
7042 0 105.65 6844.50 1 0 1 0 0 1 1 ... 1 0 0 1 0 1 1 0 0 0

7032 rows × 42 columns

In [28]:
y = telco_data_dummies['Churn']
y
Out[28]:
0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7032, dtype: int64
In [29]:
x.shape
Out[29]:
(7032, 42)
In [30]:
y.shape
Out[30]:
(7032,)
In [31]:
y.value_counts()
Out[31]:
0    5163
1    1869
Name: Churn, dtype: int64

VARIABLE IMBALANCING¶

In [32]:
from imblearn.over_sampling import SMOTE
In [33]:
smote = SMOTE(random_state=0)
In [34]:
x_resampled_smote, y_resampled_smote = smote.fit_resample(x,y)
In [35]:
y_resampled_smote.value_counts()
Out[35]:
0    5163
1    5163
Name: Churn, dtype: int64
In [36]:
x_resampled_smote
Out[36]:
SeniorCitizen MonthlyCharges TotalCharges Partner_No Partner_Yes Dependents_No Dependents_Yes PhoneService_No PhoneService_Yes MultipleLines_No ... StreamingMovies_Yes Contract_Month-to-month Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 29.850000 29.850000 0 1 1 0 1 0 0 ... 0 1 0 0 0 1 0 0 1 0
1 0 56.950000 1889.500000 1 0 1 0 0 1 1 ... 0 0 1 0 1 0 0 0 0 1
2 0 53.850000 108.150000 1 0 1 0 0 1 1 ... 0 1 0 0 0 1 0 0 0 1
3 0 42.300000 1840.750000 1 0 1 0 1 0 0 ... 0 0 1 0 1 0 1 0 0 0
4 0 70.700000 151.650000 1 0 1 0 0 1 1 ... 0 1 0 0 0 1 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10321 0 103.976753 242.804921 0 1 1 0 0 1 0 ... 1 1 0 0 0 1 0 0 0 0
10322 0 35.824447 35.824447 1 0 1 0 1 0 0 ... 1 1 0 0 0 0 0 0 1 0
10323 0 44.493077 1061.960339 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
10324 0 19.363055 19.363055 1 0 1 0 0 1 1 ... 0 1 0 0 0 1 0 0 0 1
10325 0 96.922890 96.922890 1 0 1 0 0 1 0 ... 1 1 0 0 0 0 0 0 0 0

10326 rows × 42 columns

In [37]:
y_resampled_smote.notnull().sum()
Out[37]:
10326
In [38]:
x_resampled_smote.notnull().sum()
Out[38]:
SeniorCitizen                              10326
MonthlyCharges                             10326
TotalCharges                               10326
Partner_No                                 10326
Partner_Yes                                10326
Dependents_No                              10326
Dependents_Yes                             10326
PhoneService_No                            10326
PhoneService_Yes                           10326
MultipleLines_No                           10326
MultipleLines_No phone service             10326
MultipleLines_Yes                          10326
InternetService_DSL                        10326
InternetService_Fiber optic                10326
InternetService_No                         10326
OnlineSecurity_No                          10326
OnlineSecurity_No internet service         10326
OnlineSecurity_Yes                         10326
OnlineBackup_No                            10326
OnlineBackup_No internet service           10326
OnlineBackup_Yes                           10326
DeviceProtection_No                        10326
DeviceProtection_No internet service       10326
DeviceProtection_Yes                       10326
TechSupport_No                             10326
TechSupport_No internet service            10326
TechSupport_Yes                            10326
StreamingTV_No                             10326
StreamingTV_No internet service            10326
StreamingTV_Yes                            10326
StreamingMovies_No                         10326
StreamingMovies_No internet service        10326
StreamingMovies_Yes                        10326
Contract_Month-to-month                    10326
Contract_One year                          10326
Contract_Two year                          10326
PaperlessBilling_No                        10326
PaperlessBilling_Yes                       10326
PaymentMethod_Bank transfer (automatic)    10326
PaymentMethod_Credit card (automatic)      10326
PaymentMethod_Electronic check             10326
PaymentMethod_Mailed check                 10326
dtype: int64
In [39]:
from sklearn.linear_model import LogisticRegression
In [40]:
#checking on imbalance data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
In [41]:
LogReg = LogisticRegression(solver='lbfgs', max_iter=400)
LogReg.fit(x_train,y_train)
Out[41]:
LogisticRegression(max_iter=400)
In [42]:
y_pred = LogReg.predict(x_test)
In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)
Out[43]:
0.7867803837953091
In [44]:
#checking on balanced data
x_smote_train,x_smote_test,y_smote_train,y_smote_test = train_test_split(x_resampled_smote,y_resampled_smote,test_size=0.2,random_state=42)
LogReg.fit(x_smote_train,y_smote_train)
C:\Users\Nupur\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[44]:
LogisticRegression(max_iter=400)
In [45]:
y_smote_pred = LogReg.predict(x_smote_test)
accuracy_score(y_smote_test,y_smote_pred)
Out[45]:
0.8325266214908035